condition by time point
only consider time points in KO village samples
library(Seurat)
library(Matrix)
library(dplyr)
library(tidyr)
library(ggplot2)
library(tibble)
library(cowplot)
library(patchwork)
library(stringr)
library(pheatmap)
library(reshape2)
library(readxl)
library(stringr)
library(DESeq2)
library(gridExtra)
library(ggrepel)
library(ggpubr)
library(tidyverse)
theme_set(theme_classic())
data_dir = "/home/zli4/MSK_DEC24/Analysis"
out_dir = "/fh/fast/sun_w/zhaoheng_li/MSK_DEC24/Analysis/outs/cell_type_composition"
set.seed(022520)lm_plot=function(mod){
summary_model= summary(mod)
coefficients= summary_model$coefficients
coeff_df= as.data.frame(coefficients)
coeff_df$Predictor= rownames(coeff_df)
significant_predictors= coeff_df %>%
filter(Predictor != "(Intercept)" & `Pr(>|t|)` < 0.05)
conf_int= confint(mod)
conf_df= as.data.frame(conf_int)
conf_df$Predictor= rownames(conf_df)
# Merge with the significant predictors
plot_data= merge(significant_predictors, conf_df, by = "Predictor")
names(plot_data)= c("Predictor", "Estimate", "Std.Error", "t.value", "p.value", "CI.Lower", "CI.Upper")
plot_data$Predictor = gsub(paste(c("genotype"), collapse = "|"), "", plot_data$Predictor)
fig = ggplot(plot_data, aes(x = reorder(Predictor, Estimate), y = Estimate)) +
geom_point() +
geom_errorbar(aes(ymin = CI.Lower, ymax = CI.Upper), width = 0.2) +
coord_flip() + # Flip the coordinates for better readability
theme_minimal() +
labs(
title = "Estimates and Confidence Intervals for Significant Predictors",
x = "",
y = "Estimate"
)
return(fig)
}
lm_plot_exp=function(mod){
summary_model= summary(mod)
coefficients= summary_model$coefficients
coeff_df= as.data.frame(coefficients)
coeff_df$Predictor= rownames(coeff_df)
significant_predictors= coeff_df %>%
filter(Predictor != "(Intercept)" & `Pr(>|t|)` < 0.05)
conf_int= confint(mod)
conf_df= as.data.frame(conf_int)
conf_df$Predictor= rownames(conf_df)
# Merge with the significant predictors
plot_data= merge(significant_predictors, conf_df, by = "Predictor")
names(plot_data)= c("Predictor", "Estimate", "Std.Error", "t.value", "p.value", "CI.Lower", "CI.Upper")
plot_data$Predictor = gsub(paste(c("genotype"), collapse = "|"), "", plot_data$Predictor)
fig = ggplot(plot_data, aes(x = reorder(Predictor, Estimate), y = exp(Estimate))) +
geom_point() +
geom_errorbar(aes(ymin = exp(CI.Lower), ymax = exp(CI.Upper)), width = 0.2) +
coord_flip() + # Flip the coordinates for better readability
theme_minimal() +
labs(
title = "Estimates and Confidence Intervals for Significant Predictors",
x = "",
y = "Estimate"
)
return(fig)
}integrated = readRDS(file.path(data_dir,"integrated-rpca_BDF-ref_withMeta.RDS"))# remove cells without cell type annotations
keep.cells= rownames(integrated@meta.data)[
!is.na(integrated@meta.data$celltype)
]
integrated= subset(
integrated,
cells = keep.cells
)
integrated$celltype = as.factor(integrated$celltype)
summary(integrated$celltype)## DE Ductal Endothelial EnP
## 16283 18880 1114 19845
## ESC ESC_DE Liver Mesenchymal_Stromal
## 15280 4559 9363 1698
## PFG PGT PP SC-alpha
## 8440 2627 14249 9684
## SC-beta SC-delta SC-EC Stromal
## 10152 1856 13743 3944
celltype_orders = c(
"ESC",
"ESC_DE",
"DE",
"PFG",
"PGT",
"PP",
"EnP",
"SC-EC",
"SC-alpha",
"SC-beta",
"SC-delta",
"Ductal",
"Mesenchymal_Stromal",
"Stromal",
"Endothelial",
"Liver"
)
integrated$celltype = factor(integrated$celltype,
levels =celltype_orders ,
ordered = TRUE)
tol_okabe16 <- c(
"#4477AA", "#EE6677", "#228833", "#CCBB44",
"#66CCEE", "#AA3377", "#BBBBBB", "#009E73",
"#E69F00", "#56B4E9", "#0072B2", "#D55E00",
"#CC79A7", "#117733", "#882255", "#DDCC77"
)
celltype_palette = setNames(
tol_okabe16,
celltype_orders
)
saveRDS(celltype_palette,"/fh/fast/sun_w/zhaoheng_li/MSK_DEC24/Analysis/outs/celltype_palette.RDS")
celltype_palette = readRDS("/fh/fast/sun_w/zhaoheng_li/MSK_DEC24/Analysis/outs/celltype_palette.RDS")obj = integrated%>%subset(subset = time_point%in%c("D-1","D3","D7","D11","D18"))
obj$time_point = droplevels(obj$time_point)
summary(obj$time_point)## D-1 D3 D7 D11 D18
## 15775 18602 16943 12052 36357
meta_df = obj[[]]
cell_counts = meta_df %>%
group_by(time_point, celltype, feature.4) %>%
summarise(
n_cells = n(),
.groups = "drop"
)
head(cell_counts)## # A tibble: 6 × 4
## time_point celltype feature.4 n_cells
## <fct> <ord> <chr> <int>
## 1 D-1 ESC 102_MNX1 164
## 2 D-1 ESC 103_MNX1 218
## 3 D-1 ESC 107_TET1/2/3 195
## 4 D-1 ESC 108_MNX1 152
## 5 D-1 ESC 109_TET1/2/3 54
## 6 D-1 ESC 110_WT 183
write.table(
cell_counts,
file = file.path(out_dir,"cell_counts.tsv"),
sep = "\t",
quote = FALSE,
row.names = FALSE
)table(obj$sample_name,obj$time_point)##
## D-1 D3 D7 D11 D18
## A 289 361 439 376 1560
## B 250 372 418 404 1602
## C 332 285 433 418 2344
## D 426 271 481 492 2588
## E 373 279 519 504 2060
## F 466 305 542 520 2558
## G 13639 0 0 0 0
## H 0 16729 0 0 0
## I 0 0 14111 0 0
## J 0 0 0 9338 0
## L 0 0 0 0 23645
table(obj$genotype,obj$time_point)##
## D-1 D3 D7 D11 D18
## ARX 201 506 586 350 462
## BCOR 123 157 170 118 611
## BMPR1A 235 265 83 87 219
## FOXA1 489 1291 1160 601 1349
## FOXA2 709 964 2045 276 276
## GATA4 473 495 106 104 255
## GATA4het 342 476 237 222 460
## GATA6 311 485 369 153 82
## GATA6het 542 719 233 206 333
## GLIS3 508 150 67 47 39
## GSC 376 563 194 146 266
## HHEX 549 300 150 26 20
## HHEXe 389 443 231 67 127
## HHEXhet 130 139 19 3 4
## HNF4A 141 194 120 148 309
## HNF4Ahet 121 142 95 172 265
## KDM2B 86 57 11 5 5
## MNX1 535 1166 1181 913 2035
## NANOGehet 235 205 97 95 286
## NEUROD1 307 479 384 321 1135
## NGN3 224 288 232 171 431
## NKX2-2 610 926 1048 1111 909
## ONECUT1e 655 378 416 175 375
## OTUD5 873 725 503 276 501
## PAX6 348 409 499 553 1014
## PBX1 253 78 112 98 328
## PDX1 738 623 495 553 3319
## PDX1het 320 29 14 5 7
## PROSER1 173 485 327 177 1081
## QSER1 248 377 147 71 282
## QSER1TET1 263 480 121 38 123
## RFX6 610 754 750 281 575
## TADA2B 212 131 146 171 387
## TET1 165 385 293 175 245
## TET1/2/3 256 119 90 83 247
## TLE3 227 357 386 347 1506
## WT 2798 2862 3826 3707 16489
table(obj$celltype,obj$time_point)##
## D-1 D3 D7 D11 D18
## ESC 13613 1152 209 102 62
## ESC_DE 2095 2427 13 20 0
## DE 17 14964 566 67 100
## PFG 1 5 7520 620 75
## PGT 0 0 2 0 1
## PP 4 0 46 7871 512
## EnP 6 5 8 1298 4328
## SC-EC 6 0 2 11 7093
## SC-alpha 7 2 0 58 4722
## SC-beta 0 2 2 2 5402
## SC-delta 0 1 0 25 749
## Ductal 3 1 37 193 11228
## Mesenchymal_Stromal 21 38 20 258 76
## Stromal 1 5 1261 961 1332
## Endothelial 0 0 815 254 7
## Liver 1 0 6442 312 670
meta = obj[[]]# ignore time points unique to external WT samples
common_times = rownames(table(meta$time_point,meta$source))[rowSums(table(meta$time_point,meta$source)!=0)==2]
prop_by_time = meta %>%
filter(time_point %in% common_times) %>%
group_by(time_point, celltype) %>%
summarise(n = n(), .groups = "drop_last") %>%
mutate(prop = n / sum(n)) %>% # within each time_point
ungroup()
fig1= prop_by_time%>%ggplot(aes(x = time_point, y = prop, fill = celltype)) +
geom_col() +
scale_y_continuous(labels = scales::percent_format(1)) +
scale_fill_manual(
values = celltype_palette,
drop = TRUE
)+
labs(
title = "cell-type proportions by time point",
x = "",
y = "",
fill = ""
) +
guides(fill = guide_legend(nrow = 3, byrow = TRUE)) +
theme_classic() +
theme(
legend.position = "top", # ← move legend to top
legend.direction = "horizontal", # ← ensure it’s laid out horizontally
axis.text.x = element_text(angle = 45, hjust = 1)
)
fig1ggsave(
filename = file.path(out_dir,"celltype_prop_time.png"),
plot = fig1,
width = 8,
height = 4,
units = "in",
dpi = 300
)prop_by_genotype = meta %>%
group_by(genotype, celltype) %>%
summarise(n = n(), .groups = "drop_last") %>%
mutate(prop = n / sum(n)) %>% # within each genotype
ungroup()
fig2 = prop_by_genotype %>%
ggplot(aes(x = genotype, y = prop, fill = celltype)) +
geom_col() +
scale_y_continuous(labels = scales::percent_format(1)) +
scale_fill_manual(
values = celltype_palette,
drop = TRUE
)+
labs(
title = "cell-type proportions by genotype",
x = "",
y = "",
fill = ""
) +
guides(fill = guide_legend(nrow = 3, byrow = TRUE)) +
theme_classic() +
theme(
legend.position = "top", # ← move legend to top
legend.direction = "horizontal", # ← ensure it’s laid out horizontally
axis.text.x = element_text(angle = 45, hjust = 1)
)
ggsave(
filename = file.path(out_dir,"celltype_prop_genotype.png"),
plot = fig2,
width = 8,
height = 4,
units = "in",
dpi = 300
)
fig2obj_list = SplitObject(obj, split.by = "time_point")
obj_list## $D18
## An object of class Seurat
## 36390 features across 36357 samples within 1 assay
## Active assay: RNA (36390 features, 2000 variable features)
## 3 layers present: data, counts, scale.data
## 4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
##
## $D7
## An object of class Seurat
## 36390 features across 16943 samples within 1 assay
## Active assay: RNA (36390 features, 2000 variable features)
## 3 layers present: data, counts, scale.data
## 4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
##
## $D3
## An object of class Seurat
## 36390 features across 18602 samples within 1 assay
## Active assay: RNA (36390 features, 2000 variable features)
## 3 layers present: data, counts, scale.data
## 4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
##
## $D11
## An object of class Seurat
## 36390 features across 12052 samples within 1 assay
## Active assay: RNA (36390 features, 2000 variable features)
## 3 layers present: data, counts, scale.data
## 4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
##
## $`D-1`
## An object of class Seurat
## 36390 features across 15775 samples within 1 assay
## Active assay: RNA (36390 features, 2000 variable features)
## 3 layers present: data, counts, scale.data
## 4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
prop_lst = lapply(c("D3","D7","D11","D18"),function(tp){
df = obj_list[[tp]][[]]
df %>% group_by(genotype, celltype) %>%
summarise(count = n()) %>%
mutate(proportion = count / sum(count)) %>%
arrange(genotype, desc(proportion))
})
names(prop_lst) = c("D3","D7","D11","D18")fig_list = lapply( c("D3","D7","D11","D18"),function(tp){
df = prop_lst[[tp]]
df %>%
ggplot(aes(x = genotype, y = proportion, fill = celltype)) +
geom_col() +
scale_y_continuous(labels = scales::percent_format(1)) +
scale_fill_manual(
values = celltype_palette,
drop = TRUE
)+
labs(
title = tp,
x = "",
y = "cell-type proportions",
fill = ""
) +
guides(fill = guide_legend(nrow = 3, byrow = TRUE)) +
theme_classic() +
theme(
legend.position = "top", # ← move legend to top
legend.direction = "horizontal", # ← ensure it’s laid out horizontally
axis.text.x = element_text(angle = 45, hjust = 1)
)
})
names(fig_list) = c("D3","D7","D11","D18")
for(tp in names(fig_list)){
ggsave(file.path(out_dir,paste0("genotype_celltype_proportion/",tp,".png",sep='')),
plot = fig_list[[tp]],
width = 8,
height = 6,
units = "in",
dpi = 300
)
}
fig_list## $D3
##
## $D7
##
## $D11
##
## $D18
using centering log ratio transformation
fig_list = list()
for(tp in names(prop_lst)){
df = prop_lst[[tp]]
df$celltype = as.factor(df$celltype)
levels(df$celltype)=gsub("_","-",levels(df$celltype)) # rename celltype levels
df = df%>%
group_by(genotype)%>%
mutate(geom_mean = exp(mean(log(proportion))))%>%
mutate(clr = log(proportion/geom_mean))
df$celltype = droplevels(df$celltype)
print(unique(df$celltype))
clr = df %>%
select(genotype, celltype, clr) %>%
tidyr::spread(key = celltype, value = clr) %>%
tibble::column_to_rownames(var = "genotype") %>%
as.matrix()
clr[is.na(clr)] = 2*min(clr,na.rm=T) # deal with NA values
clr_dist = as.matrix(dist(clr, method = "euclidean"))
hc = hclust(as.dist(clr_dist))
d = mean( clr_dist)
fig = pheatmap::pheatmap(exp(-clr_dist^2/(2*d)),
clustering_method = 'ward.D',
cluster_rows = hc,cluster_cols =hc,main = tp,fontsize = 5)
fig_list[[tp]] = fig[[4]]
}## [1] DE ESC-DE ESC
## [4] Mesenchymal-Stromal PFG EnP
## [7] Stromal SC-alpha SC-beta
## [10] SC-delta Ductal
## 11 Levels: ESC < ESC-DE < DE < PFG < EnP < SC-alpha < SC-beta < ... < Stromal
## [1] Liver PFG Stromal
## [4] DE Endothelial ESC
## [7] Ductal ESC-DE Mesenchymal-Stromal
## [10] PP PGT EnP
## [13] SC-EC SC-beta
## 14 Levels: ESC < ESC-DE < DE < PFG < PGT < PP < EnP < SC-EC < ... < Liver
## [1] PP EnP Stromal
## [4] PFG Endothelial Liver
## [7] DE SC-delta Mesenchymal-Stromal
## [10] Ductal ESC ESC-DE
## [13] SC-alpha SC-EC SC-beta
## 15 Levels: ESC < ESC-DE < DE < PFG < PP < EnP < SC-EC < ... < Liver
## [1] Ductal EnP SC-beta
## [4] SC-EC Stromal SC-delta
## [7] Liver PP SC-alpha
## [10] DE ESC PFG
## [13] Mesenchymal-Stromal Endothelial PGT
## 15 Levels: ESC < DE < PFG < PGT < PP < EnP < SC-EC < SC-alpha < ... < Liver
g = gridExtra::grid.arrange(gridExtra::arrangeGrob(grobs= fig_list,ncol=2))ggsave(file.path(out_dir,"geno_similarity_cell-type-composition.png"),g)
g## TableGrob (1 x 1) "arrange": 1 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
f_pseudobulk_list = file.path(out_dir,"ct_composition_pseudobulk_list.RDS")
if(file.exists(f_pseudobulk_list)){
pseudo_obj_list = readRDS(f_pseudobulk_list)
}else{
pseudo_obj_list = list()
for(tp in c("D-1","D3","D7","D11","D18")){
print(paste0("working on ", tp))
obj = obj_list[[tp]]
obj$background = as.factor(obj$background)
obj$genotype = as.factor(obj$genotype)
levels(obj$celltype)=gsub("_","-",levels(obj$celltype)) # rename celltype levels
obj$celltype = droplevels(obj$celltype)
obj$pseudo.sample.id = as.factor(paste(obj$sample_name,obj$background,obj$genotype,obj$feature.4,sep='_')) # handle cell type names
pseudo_obj= AggregateExpression(obj, assays = "RNA", return.seurat = T,
group.by = c("sample_name","background","genotype", "feature.4"))
obj$pseudo.sample.id = gsub("-","_",obj$pseudo.sample.id)
pseudo_obj$pseudo.sample.id = gsub("-","_", colnames(pseudo_obj))
pseudo_meta = pseudo_obj[[]]
df = obj[[]] %>%
group_by(pseudo.sample.id, celltype) %>%
summarise(count = n(),.groups = "drop") %>%
tidyr::complete(pseudo.sample.id, celltype, fill = list(count = 0))%>%
group_by(pseudo.sample.id) %>%
mutate(proportion = count / sum(count))
df_wide = df%>%tidyr::pivot_wider(names_from = celltype, values_from = c(count,proportion))
pseudo_meta = pseudo_meta %>%left_join(df_wide)
rownames(pseudo_meta) = pseudo_meta$orig.ident
n.df=obj[[]]%>%dplyr::count(pseudo.sample.id)
pseudo_meta = pseudo_meta %>%left_join(n.df)
rownames(pseudo_meta) = pseudo_meta$orig.ident
pseudo_obj@meta.data = pseudo_meta
pseudo_obj$source =as.factor(ifelse(pseudo_obj$sample_name%in%c("A","B","C","D","E","F"),"external_WT", "KO_village" ))
pseudo_obj_list[[tp]] = pseudo_obj
}
saveRDS(pseudo_obj_list,f_pseudobulk_list)
}names(celltype_palette) = gsub("_","-",names(celltype_palette))for(tp in c("D-1","D3","D7","D11","D18")){
print(tp)
df = pseudo_obj_list[[tp]][[]]
print(summary(as.factor(df$sample_name)))
}## [1] "D-1"
## A B C D E F G
## 2 2 2 2 2 2 81
## [1] "D3"
## A B C D E F H
## 2 2 2 2 2 2 81
## [1] "D7"
## A B C D E F I
## 2 2 2 2 2 2 81
## [1] "D11"
## A B C D E F J
## 2 2 2 2 2 2 80
## [1] "D18"
## A B C D E F L
## 1 1 1 1 1 1 80
min_cells = 30pseudo_obj_list = lapply(pseudo_obj_list,function(pseudo_obj){
pseudo_obj%>%subset(subset = n>=min_cells)
})
pseudo_obj_list## $`D-1`
## An object of class Seurat
## 36390 features across 93 samples within 1 assay
## Active assay: RNA (36390 features, 0 variable features)
## 3 layers present: counts, data, scale.data
##
## $D3
## An object of class Seurat
## 36390 features across 86 samples within 1 assay
## Active assay: RNA (36390 features, 0 variable features)
## 3 layers present: counts, data, scale.data
##
## $D7
## An object of class Seurat
## 36390 features across 84 samples within 1 assay
## Active assay: RNA (36390 features, 0 variable features)
## 3 layers present: counts, data, scale.data
##
## $D11
## An object of class Seurat
## 36390 features across 72 samples within 1 assay
## Active assay: RNA (36390 features, 0 variable features)
## 3 layers present: counts, data, scale.data
##
## $D18
## An object of class Seurat
## 36390 features across 73 samples within 1 assay
## Active assay: RNA (36390 features, 0 variable features)
## 3 layers present: counts, data, scale.data
barplot_list = list()
for(tp in c("D3","D7","D11","D18")){
pseudo_obj = pseudo_obj_list[[tp]]
df = pseudo_obj[[]]
df_long= df %>%
tidyr::pivot_longer(cols = starts_with("proportion_"),
names_to = "celltype",
values_to = "Proportion")
df_long$celltype = gsub("proportion_","",df_long$celltype)
df_long = df_long %>%
mutate(
celltype = factor(celltype, levels = gsub("_","-",celltype_orders),ordered = TRUE)
)
fig = ggplot(df_long, aes(x = pseudo.sample.id, y = Proportion, fill = celltype)) +
geom_col(position = "stack") +
scale_fill_manual(
values =celltype_palette,
drop = TRUE
)+
facet_wrap(~ genotype,
nrow = 2,
scales = "free_x",
strip.position = "top") +
ylab("cell type proportion") +
xlab("pseudobulk samples") +
labs(
title = tp,
x = "pseudobulk samples",
y = "cell type proportion",
fill = ""
) +
guides(fill = guide_legend(nrow = 2, byrow = TRUE)) +
theme_pubr() +
theme(
strip.text = element_text(angle = 90, size = 8, hjust = 0, face = "bold"),
strip.background = element_rect(colour = "black", fill = "grey95", linewidth = 0),
axis.text.x = element_blank(),
axis.line = element_blank(),
axis.ticks = element_blank()
)
barplot_list[[tp]] = fig
}
for(tp in names(barplot_list)){
ggsave(file.path(out_dir,paste0("pseudobulk_celltype_proportion/",tp,".png",sep='')),
plot = barplot_list[[tp]],
width = 12,
height = 10,
units = "in",
dpi = 300
)
}
barplot_list## $D3
##
## $D7
##
## $D11
##
## $D18
for( tp in c("D3","D7","D11","D18")){
obj = pseudo_obj_list[[tp]]
df = obj[[]]
ct_response = gsub("count_","",grep("^count_", names(df), value = TRUE))
n = nrow(df)
# create log proportions
for(ct in ct_response){
df[,paste("proportion0",ct,sep="_")] = df[,paste("proportion",ct,sep="_")]*(n-1)/n+0.5/n # transformation used by DR_data
df[[paste("log_prop", ct,sep="_")]] = log(df[,paste("proportion0",ct,sep="_")] )
print(paste(ct, " # Inf: ", sum(is.infinite(df[[paste("log_prop", ct,sep="_")]] )),sep=''))
}
for(c in grep("^log_prop_", colnames(df), value = TRUE)){
hist(as.vector(df[c])[[1]],main = paste(tp, c,sep=" : "),xlab = "log proportion")
}
}## [1] "ESC # Inf: 0"
## [1] "ESC-DE # Inf: 0"
## [1] "DE # Inf: 0"
## [1] "PFG # Inf: 0"
## [1] "EnP # Inf: 0"
## [1] "SC-alpha # Inf: 0"
## [1] "SC-beta # Inf: 0"
## [1] "SC-delta # Inf: 0"
## [1] "Ductal # Inf: 0"
## [1] "Mesenchymal-Stromal # Inf: 0"
## [1] "Stromal # Inf: 0"
## [1] "ESC # Inf: 0"
## [1] "ESC-DE # Inf: 0"
## [1] "DE # Inf: 0"
## [1] "PFG # Inf: 0"
## [1] "PGT # Inf: 0"
## [1] "PP # Inf: 0"
## [1] "EnP # Inf: 0"
## [1] "SC-EC # Inf: 0"
## [1] "SC-beta # Inf: 0"
## [1] "Ductal # Inf: 0"
## [1] "Mesenchymal-Stromal # Inf: 0"
## [1] "Stromal # Inf: 0"
## [1] "Endothelial # Inf: 0"
## [1] "Liver # Inf: 0"
## [1] "ESC # Inf: 0"
## [1] "ESC-DE # Inf: 0"
## [1] "DE # Inf: 0"
## [1] "PFG # Inf: 0"
## [1] "PP # Inf: 0"
## [1] "EnP # Inf: 0"
## [1] "SC-EC # Inf: 0"
## [1] "SC-alpha # Inf: 0"
## [1] "SC-beta # Inf: 0"
## [1] "SC-delta # Inf: 0"
## [1] "Ductal # Inf: 0"
## [1] "Mesenchymal-Stromal # Inf: 0"
## [1] "Stromal # Inf: 0"
## [1] "Endothelial # Inf: 0"
## [1] "Liver # Inf: 0"
## [1] "ESC # Inf: 0"
## [1] "DE # Inf: 0"
## [1] "PFG # Inf: 0"
## [1] "PGT # Inf: 0"
## [1] "PP # Inf: 0"
## [1] "EnP # Inf: 0"
## [1] "SC-EC # Inf: 0"
## [1] "SC-alpha # Inf: 0"
## [1] "SC-beta # Inf: 0"
## [1] "SC-delta # Inf: 0"
## [1] "Ductal # Inf: 0"
## [1] "Mesenchymal-Stromal # Inf: 0"
## [1] "Stromal # Inf: 0"
## [1] "Endothelial # Inf: 0"
## [1] "Liver # Inf: 0"
fig_list =list()
coef_list = list()
p_list = list()
for( tp in c("D3","D7","D11","D18")){
df = pseudo_obj_list[[tp]][[]]
ct_response = gsub("count_","",grep("^count_", names(df), value = TRUE))
n = nrow(df)
# create log proportions
for(ct in ct_response){
df[,paste("proportion0",ct,sep="_")] = df[,paste("proportion",ct,sep="_")]*(n-1)/n+0.5/n # transformation used by DR_data
df[[paste("log_prop", ct,sep="_")]] = log(df[,paste("proportion0",ct,sep="_")] )
}
mod_tp = list()
coef_mtx = list()
p_mtx = list()
for(ct in ct_response){
# run regression
if(max(df[,paste("proportion",ct,sep="_")])>0.03){
df$genotype = as.factor(df$genotype)
df$genotype = relevel(df$genotype, ref = "WT")
v = as.symbol(paste("log_prop", ct,sep="_"))
mod = eval(bquote(lm(.(v)~source+background+genotype,data=df)))
mod_tp[[ct]] = mod
coef_mtx[[ct]] = summary(mod)$coefficients[grepl("genotype", names(summary(mod)$coefficients[,4])),1]
p_mtx[[ct]] = summary(mod)$coefficients[grepl("genotype", names(summary(mod)$coefficients[,4])),4]
}
}
coef_mtx = do.call(cbind,coef_mtx);rownames(coef_mtx) = gsub("genotype","",rownames(coef_mtx) )
p_mtx = do.call(cbind,p_mtx);rownames(p_mtx) = gsub("genotype","",rownames(p_mtx) )
coef_list[[tp]] = coef_mtx
p_list[[tp]] = p_mtx
# visualize results
for(ct in names(mod_tp)){
if(any(p_mtx[,ct]<=0.05)){
mod = mod_tp[[ct]]
fig = lm_plot(mod)+
ggtitle(paste(tp, ":", ct,sep=''))+
theme(plot.title = element_text(size = 10, face = "bold"))
fig_list[[paste(tp,ct,sep="_")]] = fig
}
}
}only cell types with at least one genotype as a significant predictors are shown
coef_fig = gridExtra::grid.arrange(grobs = fig_list,ncol = 6,top = "Significant Coefficients on cell-type log(proportion)")print(coef_fig)## TableGrob (7 x 6) "arrange": 33 grobs
## z cells name grob
## D3_ESC 1 (2-2,1-1) arrange gtable[layout]
## D3_ESC-DE 2 (2-2,2-2) arrange gtable[layout]
## D3_DE 3 (2-2,3-3) arrange gtable[layout]
## D7_ESC 4 (2-2,4-4) arrange gtable[layout]
## D7_ESC-DE 5 (2-2,5-5) arrange gtable[layout]
## D7_DE 6 (2-2,6-6) arrange gtable[layout]
## D7_PFG 7 (3-3,1-1) arrange gtable[layout]
## D7_Ductal 8 (3-3,2-2) arrange gtable[layout]
## D7_Stromal 9 (3-3,3-3) arrange gtable[layout]
## D7_Endothelial 10 (3-3,4-4) arrange gtable[layout]
## D7_Liver 11 (3-3,5-5) arrange gtable[layout]
## D11_ESC 12 (3-3,6-6) arrange gtable[layout]
## D11_ESC-DE 13 (4-4,1-1) arrange gtable[layout]
## D11_PFG 14 (4-4,2-2) arrange gtable[layout]
## D11_PP 15 (4-4,3-3) arrange gtable[layout]
## D11_EnP 16 (4-4,4-4) arrange gtable[layout]
## D11_SC-alpha 17 (4-4,5-5) arrange gtable[layout]
## D11_Ductal 18 (4-4,6-6) arrange gtable[layout]
## D11_Mesenchymal-Stromal 19 (5-5,1-1) arrange gtable[layout]
## D11_Stromal 20 (5-5,2-2) arrange gtable[layout]
## D11_Endothelial 21 (5-5,3-3) arrange gtable[layout]
## D11_Liver 22 (5-5,4-4) arrange gtable[layout]
## D18_ESC 23 (5-5,5-5) arrange gtable[layout]
## D18_PP 24 (5-5,6-6) arrange gtable[layout]
## D18_EnP 25 (6-6,1-1) arrange gtable[layout]
## D18_SC-EC 26 (6-6,2-2) arrange gtable[layout]
## D18_SC-alpha 27 (6-6,3-3) arrange gtable[layout]
## D18_SC-beta 28 (6-6,4-4) arrange gtable[layout]
## D18_SC-delta 29 (6-6,5-5) arrange gtable[layout]
## D18_Ductal 30 (6-6,6-6) arrange gtable[layout]
## D18_Stromal 31 (7-7,1-1) arrange gtable[layout]
## D18_Liver 32 (7-7,2-2) arrange gtable[layout]
## 33 (1-1,1-6) arrange text[GRID.text.12718]
ggsave(file.path(out_dir,"log_proportion_lm_coef.png"),coef_fig,width = 15,height=15)values are truncated at \(\pm 3\) for visualization purpose
meta%>%filter(time_point=='D3')%>%group_by(celltype)%>%summarise(n=n())## # A tibble: 11 × 2
## celltype n
## <ord> <int>
## 1 ESC 1152
## 2 ESC_DE 2427
## 3 DE 14964
## 4 PFG 5
## 5 EnP 5
## 6 SC-alpha 2
## 7 SC-beta 2
## 8 SC-delta 1
## 9 Ductal 1
## 10 Mesenchymal_Stromal 38
## 11 Stromal 5
htmap_list = list()
for(tp in names(coef_list)){
coef_mtx =coef_list[[tp]]
coef_mtx[coef_mtx>3] = 3;coef_mtx[coef_mtx=3] = -3
htmap = pheatmap::pheatmap(coef_mtx,cluster_cols = F,fontsize = 5,
color = colorRampPalette(c("blue", "white", "red"))(50),
breaks = seq(min(coef_mtx,na.rm=T), max(coef_mtx,na.rm=T), length.out = 51),
main = tp)
htmap_list[[tp]] = htmap
}names(htmap_list) = names(coef_list)
for(tp in names(htmap_list)){
png(file.path(out_dir,paste0("coefs_htmaps/",tp,".png",sep='')),width = 8,height = 6,unit = "in",res = 1000)
print(htmap_list[[tp]])
dev.off()
print(htmap)
}dotplot_df = imap_dfr(p_list, function(pmat, tp) {
cmat = coef_list[[tp]]
as_tibble(pmat, rownames="genotype") %>%
pivot_longer(-genotype, names_to="cell_type", values_to="pval") %>%
left_join(
as_tibble(cmat, rownames="genotype") %>%
pivot_longer(-genotype, names_to="cell_type", values_to="coef"),
by = c("genotype","cell_type")
) %>%
mutate(time_point = tp,
neglog10_p = -log10(pval))
})
dotplot_df = dotplot_df %>%
mutate(time_point = factor(time_point,
levels = c("D3","D7","D11","D18"),
ordered = TRUE))dotplot_list = map(c("D3","D7","D11","D18"), function(tp) {
coef_mtx = coef_list[[tp]]
genotype_order = rownames(coef_mtx)[ htmap_list[[tp]]$tree_row$order ]
df_tp = filter(dotplot_df, time_point == tp) |>
mutate(
genotype = factor(genotype,
levels = genotype_order,
ordered = TRUE),
cell_type = factor(cell_type,
levels = celltype_orders,
ordered = TRUE)
) |>
drop_na(genotype, cell_type) # drops cell types not present
ggplot(df_tp,
aes(cell_type, genotype,
size = neglog10_p,
color = coef)) +
geom_point() +
scale_size(range = c(0.5, 6),
name = expression(-log[10]~italic(p))) +
scale_color_gradient2(low = "#2166AC",
mid = "white",
high = "#B2182B",
midpoint = 0,
name = "Coef") +
labs(title = tp, x = NULL, y = NULL) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5))
})
names(dotplot_list) = c("D3","D7","D11","D18")
dotplot_list## $D3
##
## $D7
##
## $D11
##
## $D18
for(tp in names(dotplot_list)){
ggsave(file.path(out_dir, paste0("coefs_dotplots/", tp, ".png",sep='')),
plot = dotplot_list[[tp]],
width = 6,height = 9,
unit = "in",
dpi = 300)
}
pdf(file.path(out_dir,"coefs_dotplots/coefs_dotplots.pdf"), width = 6, height = 9)
for(tp in names(dotplot_list)){
print(dotplot_list[[tp]])
}
dev.off()## png
## 2
# take the order from the heat-map dendrogram of *one* reference TP.
genotype_order = rownames(coef_list[["D18"]])[ htmap_list[["D18"]]$tree_row$order ]
tp_levels = levels(dotplot_df$time_point)
tp_ct_levels = unlist(
lapply(tp_levels, \(tp) paste(tp, celltype_orders, sep = "_"))
)
plot_df = dotplot_df %>%
mutate(genotype = factor(genotype,
levels = genotype_order,
ordered = TRUE),
tp_ct = factor(paste(time_point, cell_type, sep = "_"),
levels = tp_ct_levels,
ordered = TRUE)) %>%
drop_na(genotype, tp_ct)
dotplot = plot_df%>%
ggplot(aes(x = tp_ct,
y = genotype,
size = neglog10_p,
colour = coef)) +
geom_point() +
scale_size(range = c(0.3, 4),
name = expression(-log[10]~italic(p))) +
scale_colour_gradient2(low = "#2166AC",
mid = "white",
high = "#B2182B",
midpoint = 0,
name = "Coef") +
labs(x = NULL, y = NULL) +
theme_bw(base_size = 8) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
panel.grid.major.x = element_blank(), # optional: declutter grid
plot.title = element_text(hjust = 0.5))
dotplotggsave(file.path(out_dir,"coefs_dotplots/all.png"),
plot = dotplot,
width = 5.5,height = 6,
unit = "in",
dpi = 300)gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 8535650 455.9 14173958 757.0 14173958 757.0
## Vcells 3271527810 24959.8 7505063610 57259.1 6139819429 46843.2
sessionInfo()## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/Los_Angeles
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] lubridate_1.9.3 forcats_1.0.0
## [3] purrr_1.0.2 readr_2.1.5
## [5] tidyverse_2.0.0 ggpubr_0.6.0
## [7] ggrepel_0.9.5 gridExtra_2.3
## [9] DESeq2_1.44.0 SummarizedExperiment_1.34.0
## [11] Biobase_2.64.0 MatrixGenerics_1.16.0
## [13] matrixStats_1.3.0 GenomicRanges_1.56.0
## [15] GenomeInfoDb_1.40.0 IRanges_2.38.0
## [17] S4Vectors_0.42.0 BiocGenerics_0.50.0
## [19] readxl_1.4.3 reshape2_1.4.4
## [21] pheatmap_1.0.12 stringr_1.5.1
## [23] patchwork_1.2.0 cowplot_1.1.3
## [25] tibble_3.2.1 ggplot2_3.5.1
## [27] tidyr_1.3.1 dplyr_1.1.4
## [29] Matrix_1.7-0 Seurat_5.1.0
## [31] SeuratObject_5.0.2 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.22 splines_4.4.0 later_1.3.2
## [4] cellranger_1.1.0 polyclip_1.10-6 fastDummies_1.7.3
## [7] lifecycle_1.0.4 rstatix_0.7.2 globals_0.16.3
## [10] lattice_0.22-6 MASS_7.3-60.2 backports_1.4.1
## [13] magrittr_2.0.3 plotly_4.10.4 sass_0.4.9
## [16] rmarkdown_2.26 jquerylib_0.1.4 yaml_2.3.8
## [19] httpuv_1.6.15 sctransform_0.4.1 spam_2.10-0
## [22] spatstat.sparse_3.0-3 reticulate_1.36.1 pbapply_1.7-2
## [25] RColorBrewer_1.1-3 abind_1.4-5 zlibbioc_1.50.0
## [28] Rtsne_0.17 GenomeInfoDbData_1.2.12 irlba_2.3.5.1
## [31] listenv_0.9.1 spatstat.utils_3.0-4 goftest_1.2-3
## [34] RSpectra_0.16-1 spatstat.random_3.2-3 fitdistrplus_1.2-1
## [37] parallelly_1.37.1 leiden_0.4.3.1 codetools_0.2-20
## [40] DelayedArray_0.30.1 tidyselect_1.2.1 UCSC.utils_1.0.0
## [43] farver_2.1.2 spatstat.explore_3.2-7 jsonlite_1.8.8
## [46] progressr_0.14.0 ggridges_0.5.6 survival_3.6-4
## [49] systemfonts_1.1.0 tools_4.4.0 ragg_1.3.1
## [52] ica_1.0-3 Rcpp_1.0.12 glue_1.7.0
## [55] SparseArray_1.4.3 xfun_0.44 withr_3.0.0
## [58] fastmap_1.2.0 fansi_1.0.6 digest_0.6.35
## [61] timechange_0.3.0 R6_2.5.1 mime_0.12
## [64] textshaping_0.3.7 colorspace_2.1-0 scattermore_1.2
## [67] tensor_1.5 spatstat.data_3.0-4 utf8_1.2.4
## [70] generics_0.1.3 data.table_1.15.4 httr_1.4.7
## [73] htmlwidgets_1.6.4 S4Arrays_1.4.0 uwot_0.2.2
## [76] pkgconfig_2.0.3 gtable_0.3.5 lmtest_0.9-40
## [79] XVector_0.44.0 htmltools_0.5.8.1 carData_3.0-5
## [82] dotCall64_1.1-1 scales_1.3.0 png_0.1-8
## [85] knitr_1.46 rstudioapi_0.16.0 tzdb_0.4.0
## [88] nlme_3.1-164 cachem_1.0.8 zoo_1.8-12
## [91] KernSmooth_2.23-22 parallel_4.4.0 miniUI_0.1.1.1
## [94] pillar_1.9.0 grid_4.4.0 vctrs_0.6.5
## [97] RANN_2.6.1 promises_1.3.0 car_3.1-2
## [100] xtable_1.8-4 cluster_2.1.6 evaluate_0.23
## [103] cli_3.6.2 locfit_1.5-9.9 compiler_4.4.0
## [106] rlang_1.1.3 crayon_1.5.2 future.apply_1.11.2
## [109] ggsignif_0.6.4 labeling_0.4.3 plyr_1.8.9
## [112] stringi_1.8.4 viridisLite_0.4.2 deldir_2.0-4
## [115] BiocParallel_1.38.0 munsell_0.5.1 lazyeval_0.2.2
## [118] spatstat.geom_3.2-9 RcppHNSW_0.6.0 hms_1.1.3
## [121] future_1.33.2 shiny_1.8.1.1 highr_0.10
## [124] ROCR_1.0-11 igraph_2.0.3 broom_1.0.5
## [127] bslib_0.7.0